Introduction

In this project, we used several functions from the tnum package to perform a simple text analysis of Jane Austen’s Sense and Sensibility. We searched for sentences mentioning Elinor and Marianne and tagged them with "Group7_*". We also added a chapter column to our data frame to show the chapter number of each sentence and to count the number of sentences in each chapter.

Sense and Sensibility

# Point the tnum client at the server at this address. The recorded output
# below lists the number spaces reported by the server and the one selected
# for this session.
tnum.authorize(ip = "54.158.136.133")
## Available spaces: testspace, MEPED, alion-rf, shared-testspace, MSSP-1, wintergreen, testspace, testspace, testspace, testspace, testspace, testspace, testspace, testspace, testspace, testspace, testspace, testspace, testspace, testspace, testspace, testspace, testspace, testspace, testspace, testspace, shared-testspsace, hared-testspace
## Numberspace set to: shared-testspace
# Fetch the "subject" phrase list with levels = 4 (per the original note, this
# reaches down to chapter level), build a phrase graph from the returned
# paths, and plot it.
chapterList <- tnum.getDatabasePhraseList("subject", levels = 4)
chapterPlot <- tnum.makePhraseGraphFromPathList(chapterList)
tnum.plotGraph(chapterPlot)
# Locate the sentences of Sense and Sensibility that mention each heroine.
# The same query text drives both the fetch and the tagging step, so each
# query is kept in one variable and reused.
elinorQuery <- "*sense* has text =REGEXP(\"Elinor\")"
marianneQuery <- "*sense* has text =REGEXP(\"Marianne\")"

# Elinor: 653 matching sentences in the recorded run (the limit of 700 covers
# all of them)
elinorSentence <- tnum.query(elinorQuery, max = 700)
## Returned 1 thru 653 of 653 results
elinorText <- tnum.objectsToDf(elinorSentence)
elinorText
# Attach the tag "Group7_Elinor" to every sentence matched by the query
tnum.tagByQuery(elinorQuery, "Group7_Elinor")
## list(modifiedCount = 653, tagged = 653, removed = 0)
# Marianne: 524 matching sentences in the recorded run (the limit of 600
# covers all of them)
marianneSentence <- tnum.query(marianneQuery, max = 600)
## Returned 1 thru 524 of 524 results
marianneText <- tnum.objectsToDf(marianneSentence)
marianneText
# Attach the tag "Group7_Marianne" to every sentence matched by the query
tnum.tagByQuery(marianneQuery, "Group7_Marianne")
## list(modifiedCount = 524, tagged = 524, removed = 0)
# Extract the sentences in which Elinor and Marianne both appear, i.e. those
# carrying both Group7 tags (148 in the recorded run).
# `max` has to be given explicitly: without it the recorded run reported
# "Returned 1 thru 10 of 148 results", so only the first 10 joint sentences
# were fetched and every later step that excludes the joint sentences worked
# from an incomplete list.
emSentence <- tnum.query("@[Group7_Elinor,Group7_Marianne]", max = 200)
# Plot the path of tagged sentences
emText <- tnum.objectsToDf(emSentence)
emPlot <- tnum.makePhraseGraphFromPathList(emText$subject)
tnum.plotGraph(emPlot)
# Split a data frame of sentences into one row per word.
#
# sentenceDf: data frame with the sentence text in a `string.value` column.
# Returns the tokenised data frame with stop words and tokens that parse as
# numbers removed.
tokenizeSentences <- function(sentenceDf) {
  sentenceDf %>%
    unnest_tokens(word, string.value) %>%
    # as.double() gives NA for anything that is not a number; the coercion
    # warning is the expected outcome here, so it is silenced for this call
    # only
    filter(is.na(suppressWarnings(as.double(word)))) %>%
    anti_join(stop_words, by = "word")
}

# Make a wordcloud showing words appear most in sentences with Elinor
# (sentences that also mention Marianne are left out first)
elinorTextSub <- anti_join(elinorText, emText, by = "subject")
elinorWords <- tokenizeSentences(elinorTextSub)

# The character's own name would dominate the cloud, so it is dropped
elinorWordCount <- elinorWords %>%
  count(word, sort = TRUE) %>%
  filter(word != "elinor")

# `words` is spelled out in full; the previous `word =` only worked through
# partial argument matching
wordcloud(
  words = elinorWordCount$word,
  freq = elinorWordCount$n,
  max.words = 70
)

# Make a wordcloud showing words appear most in sentences with Marianne
# (sentences that also mention Elinor are left out first)
marianneTextSub <- anti_join(marianneText, emText, by = "subject")
marianneWords <- tokenizeSentences(marianneTextSub)

marianneWordCount <- marianneWords %>%
  count(word, sort = TRUE) %>%
  filter(word != "marianne")

wordcloud(
  words = marianneWordCount$word,
  freq = marianneWordCount$n,
  max.words = 70
)

# Compare the NRC sentiment profile of Elinor's and Marianne's sentences.
# Every word is matched against the NRC lexicon, the matches are counted per
# sentiment, and the two bar charts are stacked on top of each other.
nrcLexicon <- get_sentiments("nrc")

elinorSentiment <- inner_join(elinorWords, nrcLexicon)
## Joining, by = "word"
elinorSentimentCount <- count(elinorSentiment, sentiment, sort = TRUE)

plotElinor <- ggplot(
  data = elinorSentimentCount,
  mapping = aes(x = sentiment, y = n, fill = n)
) +
  geom_col() +
  labs(
    title = "Sentiment analysis for sentences with Elinor",
    x = "Sentiment",
    y = ""
  )

marianneSentiment <- inner_join(marianneWords, nrcLexicon)
## Joining, by = "word"
marianneSentimentCount <- count(marianneSentiment, sentiment, sort = TRUE)

plotMarianne <- ggplot(
  data = marianneSentimentCount,
  mapping = aes(x = sentiment, y = n, fill = n)
) +
  geom_col() +
  labs(
    title = "Sentiment analysis for sentences with Marianne",
    x = "Sentiment",
    y = ""
  )

# One plot per row: Elinor on top, Marianne below
grid.arrange(plotElinor, plotMarianne, nrow = 2)

# Fetch the word count of every sentence (4210 results in the recorded run;
# the limit of 5000 covers all of them).
# NOTE(review): this pattern is written `*sense#`, whereas the other queries
# in this script use `*sense*` - confirm that the difference is intended.
wordCountQuery <- "*sense# has count:word"
wordCount <- tnum.query(wordCountQuery, max = 5000)
## Returned 1 thru 4210 of 4210 results
# Print the first 10 results; each one carries its subject, property, guid
# and date as attributes
wordCount[seq_len(10)]
## [[1]]
## [1] 9
## attr(,"tags")
## list()
## attr(,"class")
## [1] "tnum"
## attr(,"subject")
## [1] "austen:jane:sense_and_sensibility/chapter-1/paragraph-1/sentence-1"
## attr(,"property")
## [1] "count:word"
## attr(,"guid")
## [1] "e1ff24af-a953-4317-a2e5-3b303210ccfc"
## attr(,"date")
## [1] "2020-11-16"
## 
## [[2]]
## [1] 39
## attr(,"tags")
## list()
## attr(,"class")
## [1] "tnum"
## attr(,"subject")
## [1] "austen:jane:sense_and_sensibility/chapter-1/paragraph-1/sentence-2"
## attr(,"property")
## [1] "count:word"
## attr(,"guid")
## [1] "a262aa3f-d6ff-4190-982e-ad5cfcb81385"
## attr(,"date")
## [1] "2020-11-16"
## 
## [[3]]
## [1] 33
## attr(,"tags")
## list()
## attr(,"class")
## [1] "tnum"
## attr(,"subject")
## [1] "austen:jane:sense_and_sensibility/chapter-1/paragraph-1/sentence-3"
## attr(,"property")
## [1] "count:word"
## attr(,"guid")
## [1] "7bd4c1e0-9089-473f-93bf-d2f326d846f1"
## attr(,"date")
## [1] "2020-11-16"
## 
## [[4]]
## [1] 53
## attr(,"tags")
## list()
## attr(,"class")
## [1] "tnum"
## attr(,"subject")
## [1] "austen:jane:sense_and_sensibility/chapter-1/paragraph-1/sentence-4"
## attr(,"property")
## [1] "count:word"
## attr(,"guid")
## [1] "0335a267-d6dd-406e-81a6-809794e9e199"
## attr(,"date")
## [1] "2020-11-16"
## 
## [[5]]
## [1] 17
## attr(,"tags")
## list()
## attr(,"class")
## [1] "tnum"
## attr(,"subject")
## [1] "austen:jane:sense_and_sensibility/chapter-1/paragraph-1/sentence-5"
## attr(,"property")
## [1] "count:word"
## attr(,"guid")
## [1] "fefec750-b00f-4849-99a0-a939d0d98e5f"
## attr(,"date")
## [1] "2020-11-16"
## 
## [[6]]
## [1] 5
## attr(,"tags")
## list()
## attr(,"class")
## [1] "tnum"
## attr(,"subject")
## [1] "austen:jane:sense_and_sensibility/chapter-1/paragraph-1/sentence-6"
## attr(,"property")
## [1] "count:word"
## attr(,"guid")
## [1] "4961053f-dc89-4e47-b591-97bd3bad4432"
## attr(,"date")
## [1] "2020-11-16"
## 
## [[7]]
## [1] 46
## attr(,"tags")
## list()
## attr(,"class")
## [1] "tnum"
## attr(,"subject")
## [1] "austen:jane:sense_and_sensibility/chapter-1/paragraph-1/sentence-7"
## attr(,"property")
## [1] "count:word"
## attr(,"guid")
## [1] "fabe9ac3-c42d-4029-978d-bb578197e9ae"
## attr(,"date")
## [1] "2020-11-16"
## 
## [[8]]
## [1] 15
## attr(,"tags")
## list()
## attr(,"class")
## [1] "tnum"
## attr(,"subject")
## [1] "austen:jane:sense_and_sensibility/chapter-1/paragraph-2/sentence-1"
## attr(,"property")
## [1] "count:word"
## attr(,"guid")
## [1] "fadba547-1b30-4f80-a069-5328b0f3a222"
## attr(,"date")
## [1] "2020-11-16"
## 
## [[9]]
## [1] 32
## attr(,"tags")
## list()
## attr(,"class")
## [1] "tnum"
## attr(,"subject")
## [1] "austen:jane:sense_and_sensibility/chapter-1/paragraph-2/sentence-2"
## attr(,"property")
## [1] "count:word"
## attr(,"guid")
## [1] "61da01cc-d361-4e84-9620-09869c8a999d"
## attr(,"date")
## [1] "2020-11-16"
## 
## [[10]]
## [1] 13
## attr(,"tags")
## list()
## attr(,"class")
## [1] "tnum"
## attr(,"subject")
## [1] "austen:jane:sense_and_sensibility/chapter-1/paragraph-2/sentence-3"
## attr(,"property")
## [1] "count:word"
## attr(,"guid")
## [1] "a57f65c8-7a6f-4287-b913-cb3402243a93"
## attr(,"date")
## [1] "2020-11-16"
# Plot a histogram showing the distribution of number of words per sentence
hist(
  as.numeric(wordCount),
  main = "Distribution of words per sentence",
  xlab = "Words per sentence"
)

# Fetch the ordinal number of every sentence
ordinalNum <- tnum.query("*sense* has ordinal", max = 5000)
## Returned 1 thru 4210 of 4210 results
# Scatterplot of ordinal number versus word count
# The ordinals and the word counts come from two separate queries, so they
# are paired by their "subject" attribute instead of by position; a sentence
# with no matching word count gets NA.
subjectOf <- function(x) attr(x, "subject")
ordinalSubject <- vapply(ordinalNum, subjectOf, character(1))
wordCountSubject <- vapply(wordCount, subjectOf, character(1))
# tibble() replaces the deprecated data_frame()
df <- tibble(
  ordinalNumber = as.numeric(ordinalNum),
  wordCount = as.numeric(wordCount)[match(ordinalSubject, wordCountSubject)]
)
ggplot(df, aes(ordinalNumber, wordCount)) +
  geom_point(position = "jitter", alpha = 0.3) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Data frame of sentences with their ordinal number
sentenceWord <- tnum.objectsToDf(ordinalNum)

# Subject path of every sentence, e.g.
# "austen:jane:sense_and_sensibility/chapter-1/paragraph-1/sentence-1".
# seq_along() keeps this safe when the query returned nothing.
subjectPath <- vapply(
  seq_along(ordinalNum),
  function(i) tnum.getAttrFromList(ordinalNum[i], "subject"),
  character(1)
)

# Data frame for each sentence and its chapter number. The number is read
# from the "chapter-<n>" component of the path for all sentences at once,
# rather than by position in the path; paths without such a component give NA.
df.cha <- data.frame(
  subject = seq_along(ordinalNum),
  chapter = as.numeric(str_match(subjectPath, "chapter-([0-9]+)")[, 2])
)

# Add a chapter number column to sentenceWord data frame
sentenceWord <- mutate(sentenceWord, chapter = df.cha$chapter)
head(sentenceWord)

# Bar chart showing number of sentences in every chapter.
# geom_bar() counts the rows per chapter directly; the previous
# geom_histogram(stat = "count") call also passed `bins`, which has no
# effect with a count stat.
ggplot(sentenceWord, aes(as.factor(chapter))) +
  geom_bar() +
  labs(
    title = "Subjects number of each chapter",
    x = "Chapter Number"
  )

# Sentiment per sentence, compared between the two characters.
# Label each word with the character whose sentences it came from, then stack
# the two sets of words.
elinorWords$character <- "Elinor"
marianneWords$character <- "Marianne"
characterWords <- rbind(elinorWords, marianneWords)

# Score every word with the AFINN lexicon; the join key is stated explicitly
sentimentScore <- characterWords %>%
  inner_join(get_sentiments("afinn"), by = "word")

# Sum the word scores per sentence and character. The grouping is dropped
# afterwards so that it does not leak into the join and plot below.
sentenceSentiment <- sentimentScore %>%
  group_by(subject, character) %>%
  summarize(overall_sentiment = sum(value), .groups = "drop")

# Bring in the columns of sentenceWord (including numeric.value) by subject
sentenceSentiment <- left_join(sentenceSentiment, sentenceWord, by = "subject")

# Smoothed sentence sentiment against numeric.value, one curve per character.
# NOTE(review): numeric.value is presumably the sentence's ordinal number,
# since sentenceWord is built from the "has ordinal" query - confirm.
ggplot(
  sentenceSentiment,
  aes(numeric.value, overall_sentiment, color = character)
) +
  geom_smooth(show.legend = TRUE, se = FALSE) +
  labs(
    x = "Sentence ordinal number",
    y = "Overall AFINN sentiment"
  )
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

  # facet_wrap(~character)
# tnum.deleteByQuery("*sense* has text =REGEXP(\"Elinor\")", max=700)
# tnum.deleteByQuery("*sense* has text =REGEXP(\"Marianne\")", max = 600)
# tnum.deleteByQuery("@[Group7_Elinor,Group7_Marianne]")
# tnum.deleteByQuery("*sense# has count:word", max=5000)
# tnum.deleteByQuery("*sense* has ordinal", max=5000)